Main Analysis (Exploratory Data Analysis)
# dropping rows that have NAs in the hammer price column, which indicates that the lots are not sold
# drop the rows that are from doha, as indicated in the data quality analysis
df1 <- art_df %>% drop_na(hammer_price_bp_usd)
df2 <- df1 %>%
filter(abs(df1$hammer_price_bp_usd -
median(df1$hammer_price_bp_usd)) <=3*sd(df1$hammer_price_bp_usd)) %>%
filter(location %in% c("HONG KONG", "NEW YORK", "LONDON", "PARIS", "MILAN","DUBAI","AMSTERDAM"))
Auction Information
Insert elizabet’s part
Do Certain Lot Attributes Result in Higher Price?
Lot Titles
What lots have higher price?
df_wordcloud <- df1[,c("lot_title","hammer_price_bp_usd")]
df_wordcloud <- arrange(df_wordcloud,desc(df_wordcloud$hammer_price_bp_usd))[1:500,]
library(wordcloud)
library(tm)
pal <- brewer.pal(9, "OrRd")
pal <- pal[-(1:3)]
wordcloud(df_wordcloud$lot_title, df_wordcloud$hammer_price_bp_usd, min.freq=500, scale=c(5, .5), random.order = FALSE, random.color = FALSE, colors= pal)
#### What words appear more often in the lot titles?
# collapse the lot_title column by word and count the frequency they appear in the titles.
temp <- paste(df1$lot_title, collapse=' ' )
temp <- tolower(temp)
temp <- gsub(" *\\b[[:alpha:]]{1}\\b *", " ", temp)
temp <- gsub('[[:punct:] ]+',' ',temp)
temp <- as.list(strsplit(temp, " "))
temp <- unlist(temp)[!(unlist(temp) %in% stopwords("english"))]
temp <- unlist(temp)[!(unlist(temp) %in% "na")]
word_count <- na.omit(as.data.frame(table(temp)))
word_count <- arrange(word_count,desc(word_count$Freq))[1:300,]
# visualize word frequencies
pal <- brewer.pal(9, "Dark2")
wordcloud(word_count$temp, word_count$Freq, min.freq =20, scale=c(5, .5), random.order = FALSE, random.color = FALSE, colors= pal)
#### Looking at lots that have name “Untitled”, what price ranges are they in? Is it correlated?
library(vcd)
df1 <- df1 %>%
dplyr::mutate(hammer_price_bp_usd_range = forcats::fct_relevel(hammer_price_bp_usd_range, "<$50,000"))%>%
dplyr::mutate(hammer_price_bp_usd_range = forcats::fct_relevel(hammer_price_bp_usd_range, "<$500,000"))%>%
dplyr::mutate(hammer_price_bp_usd_range = forcats::fct_relevel(hammer_price_bp_usd_range, "$500,000+"))
vcd::mosaic(hammer_price_bp_usd_range~is_untitled, direction = c("v", "h"),df1,
gp = gpar(fill = c("lightyellow", "lightpink")),
labeling = labeling_border(rot_labels = c(0, 45),pos_labels="center"))
#### Are they mostly contemprary?
#untitle_labels = read.csv("untitle_ratio.csv", header=TRUE)
Does the era of the lot affect its price?
df3 <- df2 %>%
filter(df2$birth_year>1800)
ggplot(df3, aes(birth_year,hammer_price_bp_usd)) +
geom_smooth(method='lm',formula=y ~ poly(x, 2))+
geom_point(alpha = .1) +
theme_grey(10)+scale_y_log10()+ geom_density_2d(bins = 5)
#box plot price vs year
#ggplot(df3, aes(x=auth_era), y=hammer_price_bp_usd)+ stat_summary(fun.y="mean", geom="line", aes(group=1))
ggplot(df3, aes(auth_era, hammer_price_bp_usd)) +
geom_boxplot()+theme(axis.text.x = element_text(angle = 90, hjust = 1))+scale_y_log10()
## Do Certain Extrernal Factors Result in Higher Price? ###Does the Order Matter?
ggplot(df2, aes(percent_in_auction,hammer_price_bp_usd)) +
geom_smooth(method='lm',formula=y~x,color="red")+
geom_point(alpha = .05) +
theme_grey(10)+scale_y_log10()+
facet_wrap(~location)
### Is there an impact from the financial crisis?
df1$auc_ymd <- as.Date(df1$auc_year_month_date)
art_yearfin <- df1 %>% group_by(month=lubridate::floor_date(auc_ymd, "month")) %>% summarise(revenue = mean(hammer_price_bp_usd))
ggplot() +geom_line(data=art_yearfin, aes(x=month, y=revenue/1000000))+ggtitle("Financial Crisis' Effect on average lot price ($M)")+ylab("average price")+xlab("Time")+theme(axis.text.x = element_text(angle = 45, hjust = 1))+scale_y_log10()
Let’s start by looking at the average lot prices of Sothebeys on a yearly scale. Our guess is that we are supposed to see a significant drop around the time of the financial crisis.
art_finance <- art_final[c("auc_year", "auc_month", "location","hammer_price_bp_usd")] %>% filter(!is.na(hammer_price_bp_usd))
art_yearfin <- art_finance %>% group_by(auc_year) %>% summarise(revenue = sum(hammer_price_bp_usd))
ggplot(art_yearfin, aes(x=auc_year, y=as.numeric(revenue/1000000))) +geom_line(col="blue")+ggtitle("Financial Crisis' Effect on Auction Revenue ($M)")+ylab("Revenue")+xlab("Year")
Indeed, we are seeing a big deep around late 2010. It is surprising to see that it took some time for the effect to reach the auction houses. Perhaps it is certain locations that are skewing some of the data. Let us try to facet the data by location and see if that could present us with a better outlook.
art_locfin <- art_finance %>% group_by(auc_year, location) %>% summarise(revenue = sum(hammer_price_bp_usd))
ggplot(art_locfin, aes(x=auc_year, y=as.numeric(revenue/1000000))) +geom_line(col="blue")+ggtitle("Financial Crisis' Effect on Auction Revenue ($M)")+ylab("Revenue")+xlab("Year") + facet_wrap(~location, scales ="free_y")
It looks like, indeed, some locations do not have consistent data and might be throwing the calculation off (Dubai, Doha, and Amsterdam). Let us tke a look at the same chart without those locations.
art_locfin <- art_finance %>% group_by(auc_year, location) %>% summarise(revenue = sum(hammer_price_bp_usd)) %>% filter(location %in% c("HONG KONG", "NEW YORK", "LONDON", "PARIS", "MILAN"))
ggplot(art_locfin, aes(x=auc_year, y=as.numeric(revenue/1000000))) +geom_line(col="blue")+ggtitle("Financial Crisis' Effect on Auction Revenue ($M)")+ylab("Revenue")+xlab("Year") + facet_wrap(~location, scales ="free_y")
We notice a consistent drop in revenue across all locations above starting 2010. Therefore, our hypothesys must be correct: the financial crisis did have an effect on the auction revenue across the world (specifically significant drops are observed in New York and Hong Kong).
Does Season Matter?
vcd::mosaic(hammer_price_bp_usd_range~auc_season, direction = c("v", "h"),df1,
gp = gpar(fill = c("lightyellow", "lightpink")),
labeling = labeling_border(rot_labels = c(0, 90),pos_labels="center"))
###Does Location Matter? # Executive Summary